Load libraries and data

easypackages::libraries("here","ggplot2","caret","e1071","pheatmap","reshape2","NbClust","grid","patchwork","readxl","patchwork","WGCNA","psych","nlme")
source(here("code","ndar_functions.R"))
source(here("code","euaims_functions.R"))
source(here("code","get_ggColorHue.R"))

options(stringsAsFactors = FALSE)

# Z-score threshold to use for subtyping
z_thresh = 1

codepath = here("code")
datapath = here("data")
figpath = here("figures")
resultpath = here("results","ndar")
plotpath = here("plots","ndar")

# read in data
Dverbal_Discovery = read.csv(file.path(datapath,"tidy_verbal_disc.csv"))
Dverbal_Replication = read.csv(file.path(datapath,"tidy_verbal_rep.csv"))

Subtyping using Z-score of the difference between SC and RRB

make_subtype <- function(data2use, z_thresh, mean2use=NULL, sd2use=NULL){
  # compute difference score
  vars2use = c("dbaes_atotal","dbaes_btotal")
  diff_score = data2use[,vars2use[1]] - data2use[,vars2use[2]]
  
  # compute mean and sd if necessary
  if (is.null(mean2use)){
    mean2use = mean(diff_score)
  } # if (is.null(mean2use))
  
  if (is.null(sd2use)){
    sd2use = sd(diff_score)
  } # if (is.null(sd2use))
  
  # compute z-score
  data2use$z_ds = (diff_score - mean2use)/sd2use
  
  # make subtype factor
  data2use$z_ds_group = "SC_equal_RRB"
  data2use$z_ds_group[data2use$z_ds>z_thresh] = "SC_over_RRB"
  data2use$z_ds_group[data2use$z_ds<(z_thresh*-1)] = "RRB_over_SC"
  data2use$z_ds_group = factor(data2use$z_ds_group)
  return(data2use)
  
} # function make_subtype

vars2use = c("dbaes_atotal","dbaes_btotal")

# compute Discovery mean and sd
ds_disc = Dverbal_Discovery[,vars2use[1]] - Dverbal_Discovery[,vars2use[2]]
mean2use = mean(ds_disc) 
sd2use = sd(ds_disc)

Dverbal_Discovery = make_subtype(data2use = Dverbal_Discovery,
                                 z_thresh = z_thresh,
                                 mean2use = mean2use,
                                 sd2use = sd2use)

# compute Replication mean and sd
ds_rep = Dverbal_Replication[,vars2use[1]] - Dverbal_Replication[,vars2use[2]]
# mean2use = mean(ds_rep) 
# sd2use = sd(ds_rep)

Dverbal_Replication = make_subtype(data2use = Dverbal_Replication,
                                 z_thresh = z_thresh,
                                 mean2use = mean2use,
                                 sd2use = sd2use)

Make scatterplots with difference score Z subtypes in different colors

maxScores = c(3,4)
fontSize = 30

p_disc = ggplot(data = Dverbal_Discovery, aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,1) + xlim(0,1) + ggtitle("NDAR Discovery") + 
  theme(text = element_text(size=fontSize), 
        axis.text.x = element_text(size=fontSize)) 
p1_top_left = p_disc + guides(colour=FALSE) 
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Disc_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p1_top_left)
p_disc

table(Dverbal_Discovery$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          141          611          137
cor_res = cor.test(Dverbal_Discovery$dbaes_atotal, Dverbal_Discovery$dbaes_btotal)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Discovery$dbaes_atotal and Dverbal_Discovery$dbaes_btotal
## t = 6.6829, df = 887, p-value = 4.134e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1554332 0.2806578
## sample estimates:
##       cor 
## 0.2189469
p_rep = ggplot(data = Dverbal_Replication, aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,1) + xlim(0,1) +  ggtitle("NDAR Replication") + 
  theme(text = element_text(size=fontSize), 
        axis.text.x = element_text(size=fontSize)) 
p2_bottom_left = p_rep + guides(colour=FALSE) 
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Rep_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p2_bottom_left)
p_rep

table(Dverbal_Replication$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          143          625          122
cor_res = cor.test(Dverbal_Replication$dbaes_atotal, Dverbal_Replication$dbaes_btotal)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Replication$dbaes_atotal and Dverbal_Replication$dbaes_btotal
## t = 7.1459, df = 888, p-value = 1.864e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1700824 0.2943934
## sample estimates:
##       cor 
## 0.2331903

Run supervised model with Discovery as training and Replication as Test

# run validation
# make subtypes using z-scores computed from the mean and sd of the training set
train_data = Dverbal_Discovery
test_data = Dverbal_Replication

mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
tmp_train = make_subtype(data2use = train_data,
                         z_thresh = z_thresh,
                         mean2use = mean2use,
                         sd2use = sd2use)

# mean2use = mean(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# sd2use = sd(test_data[,vars2use[1]] - test_data[,vars2use[2]])
tmp_test = make_subtype(data2use = test_data,
                        z_thresh = z_thresh,
                        mean2use = mean2use,
                        sd2use = sd2use)
# compute model
mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
pred_labels = predict(mod2use, tmp_test[,vars2use])
confmat = table(tmp_test$z_ds_group,pred_labels)
acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)

# plot confusion matrix
setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
pheatmap(confmat/rowSums(confmat)*100, display_numbers = confmat, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = fontSize-10, fontsize_row = fontSize-10, fontsize_col = fontSize-10,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90,
         breaks= seq(0,100, length=100))
setHook("grid.newpage", NULL, "replace")
grid::grid.text("Actual Labels", y=-0.07, gp=gpar(fontsize=16))
grid::grid.text("Predicted Labels", x=-0.07, rot=90, gp=gpar(fontsize=16))

# show accuracy
true_accuracy = acc
true_accuracy
## [1] 0.9932584

Permute subtype labels to examine how well supervised model performs

nperm = 10000
# make subtypes using z-scores computed from the mean and sd of the training set
train_data = Dverbal_Discovery
test_data = Dverbal_Replication
mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
tmp_train = make_subtype(data2use = train_data,
                         z_thresh = z_thresh,
                         mean2use = mean2use,
                         sd2use = sd2use)

# mean2use = mean(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# sd2use = sd(test_data[,vars2use[1]] - test_data[,vars2use[2]])
tmp_test = make_subtype(data2use = test_data,
                        z_thresh = z_thresh,
                        mean2use = mean2use,
                        sd2use = sd2use)

acc = vector(length = nperm)
for (iperm in 1:nperm){
  # set seed for reproducibility
  set.seed(iperm)
  # compute model
  permuted_labels = sample(tmp_train$z_ds_group)
  mod2use = svm(x = tmp_train[,vars2use], y = permuted_labels)
  pred_labels = predict(mod2use, tmp_test[,vars2use])
  confmat = table(tmp_test$z_ds_group,pred_labels)
  acc[iperm] = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
} # for (iperm in 1:nperm)

df2plot = data.frame(Accuracy = acc)
p = ggplot(data = df2plot, aes(x = Accuracy)) + geom_histogram() + geom_vline(xintercept=true_accuracy)
p

# compute p-value
pval = sum(c(true_accuracy,acc)>=true_accuracy)/(nperm+1)
pval
## [1] 9.999e-05

Plot difference score Z subtypes in Discovery set

maxScores = c(3,4)

# Discovery - make plot with all individuals shown as lines
df2use = Dverbal_Discovery[,c("subjectkey",adi_total_vars2use)]
df2use$subgrp = factor(tmp_train$z_ds_group)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$SC = df2use$dbaes_atotal
df2use$RRB = df2use$dbaes_btotal

df4plot = melt(df2use,
               id.vars = c("subjectkey","subgrp"), 
               measure.vars = c("SC","RRB"))

p = ggplot(data = df4plot, aes(x = variable, 
                               y = value, 
                               colour = subgrp, 
                               group = subjectkey)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + ylim(0,1) + guides(color=FALSE)
p = p + ylab("Percent Severity") + xlab("ADI-R subscale") + 
  theme(text = element_text(size=fontSize-5), 
        axis.text.x = element_text(size=fontSize)) 
p3_middle_top = p + guides(colour=FALSE)
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Disc_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p3_middle_top)
p

Plot difference score Z subtypes in Replication set

maxScores = c(3,4)

# Replication - make plot with all individuals shown as lines
df2use = Dverbal_Replication[,c("subjectkey",adi_total_vars2use)]
df2use$subgrp = factor(tmp_test$z_ds_group)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$SC = df2use$dbaes_atotal
df2use$RRB = df2use$dbaes_btotal

df4plot = melt(df2use,
               id.vars = c("subjectkey","subgrp"), 
               measure.vars = c("SC","RRB"))

p = ggplot(data = df4plot, aes(x = variable, 
                               y = value, 
                               colour = subgrp, 
                               group = subjectkey)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + ylim(0,1) + guides(color=FALSE)
p = p + ylab("Percent Severity") + xlab("ADI-R subscale") + 
  theme(text = element_text(size=fontSize-5), 
        axis.text.x = element_text(size=fontSize))
p4_middle_bottom = p + guides(colour=FALSE)
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Rep_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p4_middle_bottom)
p

Subtype using hierarchical clustering and dynamic hybrid tree cut algorithm to find the subtypes

NDAR Discovery

# deep split parameter
dS = 0
maxScores = c(3,4)

data2use = Dverbal_Discovery[,vars2use]

# discReorderedItems = colnames(data2use)
fname2save = file.path(plotpath,
                       sprintf("clustergram_ADIalgoTotals_verbalDiscovery_euclidean_ward_deepSplit%d.pdf",dS))
verbalDiscovery_clustResults = ClusterData(data2use,
                                     deepSplit=dS,
                                     fname2save = fname2save)
##  ..cutHeight not given, setting it to 43.9  ===>  99% of the (truncated) height range in dendro.
##  ..done.
oldColors = c("blue","brown","green","red","turquoise","yellow")
newColors = c("5","4","6","3","2","1")

verbalDiscovery_clustResults = relabelClusters(verbalDiscovery_clustResults, oldColors, newColors)
makeClustergram(verbalDiscovery_clustResults, fname2save = fname2save)
## quartz_off_screen 
##                 2
# make plot with all individuals shown as lines
df2use = Dverbal_Discovery[,c("subjectkey",vars2use)]
df2use$subgrp = factor(verbalDiscovery_clustResults$dynamicColors)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$dbaes_atotal = df2use$dbaes_atotal
df2use$dbaes_btotal = df2use$dbaes_btotal
df4plot = melt(df2use,
               id.vars = c("subjectkey","subgrp"),
               measure.vars = c("dbaes_atotal","dbaes_btotal"))

p = ggplot(data = df4plot, aes(x = variable,
                               y = value,
                               colour = subgrp,
                               group = subjectkey)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + guides(color=FALSE) + ylim(0,1)
  # scale_colour_manual(values = colors2use) + ylim(0,1)
p = p + ylab("Percent Severity") + xlab("ADI-R subscale")
fname2save = file.path(plotpath,
                       sprintf("summaryPlot_IndividualSubs_ADIalgoTotals_verbalDiscovery_euclidean_ward_deepSplit%d.pdf",dS))
ggsave(filename = fname2save)
p

# scatterplot
p_disc = ggplot(data = df2use, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(subgrp))) + geom_point() + xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") + ggtitle("NDAR Discovery") + ylim(0,1) + xlim(0,1)
p_disc

NDAR Replication

# deep split parameter
dS = 0
maxScores = c(3,4)

data2use = Dverbal_Replication[,vars2use]

# discReorderedItems = colnames(data2use)
fname2save = file.path(plotpath,
                       sprintf("clustergram_ADIalgoTotals_verbalReplication_euclidean_ward_deepSplit%d.pdf",dS))
verbalReplication_clustResults = ClusterData(data2use,
                                             deepSplit=dS,
                                             fname2save = fname2save)
##  ..cutHeight not given, setting it to 42.9  ===>  99% of the (truncated) height range in dendro.
##  ..done.
oldColors = c("black","blue","brown","green","red","turquoise","yellow")
newColors = c("1","3","4","5","2","7","6")

verbalReplication_clustResults = relabelClusters(verbalReplication_clustResults, oldColors, newColors)
makeClustergram(verbalReplication_clustResults, fname2save = fname2save)
## quartz_off_screen 
##                 2
# make plot with all individuals shown as lines
df2use = Dverbal_Replication[,c("subjectkey",vars2use)]
df2use$subgrp = factor(verbalReplication_clustResults$dynamicColors)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$dbaes_atotal = df2use$dbaes_atotal
df2use$dbaes_btotal = df2use$dbaes_btotal

df4plot = melt(df2use,
               id.vars = c("subjectkey","subgrp"),
               measure.vars = c("dbaes_atotal","dbaes_btotal"))

p = ggplot(data = df4plot, aes(x = variable,
                               y = value,
                               colour = subgrp,
                               group = subjectkey)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + guides(color=FALSE) + ylim(0,1)
  # scale_colour_manual(values = colors2use) + ylim(0,1)
p = p + ylab("Percent Severity") + xlab("ADI-R subscale")
fname2save = file.path(plotpath,
                       sprintf("summaryPlot_IndividualSubs_ADIalgoTotals_verbalReplication_euclidean_ward_deepSplit%d.pdf",dS))
ggsave(filename = fname2save)
p

# scatterplot
p_rep = ggplot(data = df2use, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(subgrp))) + geom_point() + xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") + ggtitle("NDAR Replication") + ylim(0,1) + xlim(0,1)
p_rep

Subtype using hierarchical agglomerative clustering, looking for k=3

Will use SC, RRB and the difference score as features

# how many clusters do you want?
nclusters = 3

# cluster with SC, RRB, and difference score
ds = Dverbal_Discovery[,vars2use[1]] - Dverbal_Discovery[,vars2use[2]]
# distance matrix
distmat = dist(x = cbind(Dverbal_Discovery[,vars2use], ds), method="euclidean")
# hierarchical clustering
disc_tree = hclust(d=distmat, method="ward.D2")
# cut the tree
treecut_res = cutree(tree=disc_tree, k=nclusters)
Dverbal_Discovery$hc_subgrp = treecut_res

ds = Dverbal_Replication[,vars2use[1]] - Dverbal_Replication[,vars2use[2]]
# distance matrix
distmat = dist(x = cbind(Dverbal_Replication[,vars2use], ds), method="euclidean")
# hierarchical clustering
disc_tree = hclust(d=distmat, method="ward.D2")
# cut the tree
treecut_res = cutree(tree=disc_tree, k=nclusters)
Dverbal_Replication$hc_subgrp = treecut_res

Plot Discovery dataset after using hierarchical agglomerative clustering

Discovery set

maxScores = c(3,4)

# Discovery - make plot with all individuals shown as lines
df2use = Dverbal_Discovery[,c("subjectkey",adi_total_vars2use)]
df2use$subgrp = factor(Dverbal_Discovery$hc_subgrp)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$SC = df2use$dbaes_atotal
df2use$RRB = df2use$dbaes_btotal

df4plot = melt(df2use,
               id.vars = c("subjectkey","subgrp"),
               measure.vars = c("SC","RRB"))

p = ggplot(data = df4plot, aes(x = variable,
                               y = value,
                               colour = subgrp,
                               group = subjectkey)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + guides(color=FALSE)
p = p + ylab("Percent Severity") + xlab("ADI-R subscale")
p

# scatterplot
p_disc = ggplot(data = Dverbal_Discovery, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(hc_subgrp))) + geom_point() + xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") + ggtitle("NDAR Discovery")
p_disc

table(Dverbal_Discovery$hc_subgrp)
## 
##   1   2   3 
## 339 225 325

Plot Replication dataset after using hierarchical agglomerative clustering

maxScores = c(3,4)

# Replication - make plot with all individuals shown as lines
df2use = Dverbal_Replication[,c("subjectkey",adi_total_vars2use)]
df2use$subgrp = factor(Dverbal_Replication$hc_subgrp)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$SC = df2use$dbaes_atotal
df2use$RRB = df2use$dbaes_btotal

df4plot = melt(df2use,
               id.vars = c("subjectkey","subgrp"),
               measure.vars = c("SC","RRB"))

p = ggplot(data = df4plot, aes(x = variable,
                               y = value,
                               colour = subgrp,
                               group = subjectkey)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + guides(color=FALSE)
p = p + ylab("Percent Severity") + xlab("ADI-R subscale")
p

# scatterplot
p_rep = ggplot(data = Dverbal_Replication, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(hc_subgrp))) + geom_point() + xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") + ggtitle("NDAR Replication")
p_rep

table(Dverbal_Replication$hc_subgrp)
## 
##   1   2   3 
## 357 248 285

Subtype using hierarchical agglomerative clustering, but use NbClust to find optimal number of clusters

Will use SC, RRB. NbClust won’t work if I give it the difference score

nbc_disc_res = NbClust(data = Dverbal_Discovery[,vars2use], method = "ward.D2")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 3 proposed 3 as the best number of clusters 
## * 7 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 3 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  4 
##  
##  
## *******************************************************************
nbc_rep_res = NbClust(data = Dverbal_Replication[,vars2use], method = "ward.D2")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 5 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## * 5 proposed 7 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 3 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

Plot Discovery dataset after using hierarchical agglomerative clustering and NbClust

Discovery set

maxScores = c(3,4)

# Discovery - make plot with all individuals shown as lines
df2use = Dverbal_Discovery[,c("subjectkey",adi_total_vars2use)]
df2use$subgrp = factor(nbc_disc_res$Best.partition)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$SC = df2use$dbaes_atotal
df2use$RRB = df2use$dbaes_btotal

df4plot = melt(df2use,
               id.vars = c("subjectkey","subgrp"),
               measure.vars = c("SC","RRB"))

p = ggplot(data = df4plot, aes(x = variable,
                               y = value,
                               colour = subgrp,
                               group = subjectkey)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + guides(color=FALSE)
p = p + ylab("Percent Severity") + xlab("ADI-R subscale")
p

# scatterplot
Dverbal_Discovery$nbclust_subgrp = factor(nbc_disc_res$Best.partition)
p_disc = ggplot(data = Dverbal_Discovery, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(nbclust_subgrp))) + geom_point() + xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") + ggtitle("NDAR Discovery")
p_disc

table(Dverbal_Discovery$nbclust_subgrp)
## 
##   1   2   3   4 
## 149 369 171 200

Plot Replication dataset after using hierarchical agglomerative clustering

maxScores = c(3,4)

# Replication - make plot with all individuals shown as lines
df2use = Dverbal_Replication[,c("subjectkey",adi_total_vars2use)]
df2use$subgrp = factor(nbc_rep_res$Best.partition)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$SC = df2use$dbaes_atotal
df2use$RRB = df2use$dbaes_btotal

df4plot = melt(df2use,
               id.vars = c("subjectkey","subgrp"),
               measure.vars = c("SC","RRB"))

p = ggplot(data = df4plot, aes(x = variable,
                               y = value,
                               colour = subgrp,
                               group = subjectkey)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + guides(color=FALSE)
p = p + ylab("Percent Severity") + xlab("ADI-R subscale")
p

# scatterplot
Dverbal_Replication$nbclust_subgrp = factor(nbc_rep_res$Best.partition)
p_rep = ggplot(data = Dverbal_Replication, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(nbclust_subgrp))) + geom_point() + xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") + ggtitle("NDAR Replication")
p_rep

table(Dverbal_Replication$nbclust_subgrp)
## 
##   1   2   3 
## 450 353  87

Apply NDAR subtypes to EU-AIMS LEAP data

# make SC and RRB percentages since EU-AIMS data is specified as percentages
Dverbal = read.csv(file.path(datapath,"tidy_verbal.csv"))
euaims_data = read.csv(file.path(datapath,"tidy_euaims.csv"))
mask1 = euaims_data$Diagnosis=="ASD" 
mask2 =  (is.na(euaims_data$A1_pct_severity) | is.na(euaims_data$A2_pct_severity) | is.na(euaims_data$A3_pct_severity) | is.na(euaims_data$B1_pct_severity) | is.na(euaims_data$B2_pct_severity) | is.na(euaims_data$B3_pct_severity) | is.na(euaims_data$B4_pct_severity)) 
euaims_data = subset(euaims_data, (mask1 & !mask2))


Dverbal[,vars2use[1]] = Dverbal[,vars2use[1]]
Dverbal[,vars2use[2]] = Dverbal[,vars2use[2]]
euaims_data[,vars2use[1]] = (euaims_data$A1_pct_severity + 
                               euaims_data$A2_pct_severity + 
                               euaims_data$A3_pct_severity)/3
euaims_data[,vars2use[2]] = (euaims_data$B1_pct_severity + 
                               euaims_data$B2_pct_severity + 
                               euaims_data$B3_pct_severity + 
                               euaims_data$B4_pct_severity)/4

train_data = Dverbal
test_data = euaims_data

mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]], na.rm=TRUE)
sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]], na.rm=TRUE)
c(mean2use, sd2use)
## [1] -0.01045243  0.19482749
tmp_train = make_subtype(data2use = train_data,
                         z_thresh = z_thresh,
                         mean2use = mean2use,
                         sd2use = sd2use)

tmp_test = make_subtype(data2use = test_data,
                        z_thresh = z_thresh,
                        mean2use = mean2use,
                        sd2use = sd2use)

# compute model
mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
pred_labels = predict(mod2use, tmp_test[,vars2use])
confmat = table(tmp_test$z_ds_group,pred_labels)
acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)

tmp_test$svm_pred_labels = pred_labels

# plot confusion matrix
setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
pheatmap(confmat/rowSums(confmat)*100, display_numbers = TRUE, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = 15,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90)
setHook("grid.newpage", NULL, "replace")
grid::grid.text("Actual Labels", y=-0.07, gp=gpar(fontsize=16))
grid::grid.text("Predicted Labels", x=-0.07, rot=90, gp=gpar(fontsize=16))

# scatterplot
p1 = ggplot(data = tmp_train, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))) + geom_point() + xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("NDAR ALL") + 
  theme(text = element_text(size=fontSize-5),axis.text.x = element_text(size=fontSize))
p1

p2 = ggplot(data = tmp_test, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))) + geom_point() + xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("EU-AIMS with Groups from NDAR ALL") + 
  theme(text = element_text(size=fontSize-5), axis.text.x = element_text(size=fontSize))
p2

p3 = ggplot(data = tmp_test, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(pred_labels))) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("EU-AIMS") + 
  theme(text = element_text(size=fontSize-5), axis.text.x = element_text(size=fontSize))
p5_bottom_right = p3 + guides(colour=FALSE)
ggsave(filename = file.path(plotpath, sprintf("final_EUAIMS_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p5_bottom_right)
p3

# # write out EU-AIMS LEAP data with subgroups defined by NDAR All
write.csv(tmp_test, file.path(datapath, sprintf("tidy_euaims_NDAR_subtypes_diffscore_z%s.csv",as.character(z_thresh))))

Make final plot

p_final = p1_top_left + p3_middle_top + plot_spacer() + p2_bottom_left + p4_middle_bottom + p5_bottom_right + plot_layout(nrow=3, ncol=3, widths = c(4,4,4), heights = c(8,8,8))
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_EUAIMS_subtypes_plot_z%s.pdf",as.character(z_thresh))), plot = p_final)
p_final

Integrate subtypes with rest of EU-AIMS LEAP data

euaims_data = read.csv(file.path(datapath,"tidy_euaims.csv"))
td_df = subset(euaims_data, euaims_data$Diagnosis=="TD",select=2:6)
td_df$A_pct_severity = as.numeric(NA)
td_df$B_pct_severity = as.numeric(NA)
td_df$subgrp = "TD"

cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","svm_pred_labels")
tmp_asd = tmp_test[,cols2use]
colnames(tmp_asd)[6] = "A_pct_severity"
colnames(tmp_asd)[7] = "B_pct_severity"
colnames(tmp_asd)[8] = "subgrp"
asd_df = tmp_asd

all_data = rbind(td_df,asd_df)

fname = "/Users/mlombardo/Dropbox/euaims/data/rsfmri_preproc/euaims_preproc.xlsx"
pp_data = read_excel(fname)
mask = pp_data$notes=="ok"
pp_data = subset(pp_data, mask)

asd_df = merge(pp_data[,c("subid","sex")],asd_df, by = "subid")
td_df = merge(pp_data[,c("subid","sex")],td_df, by = "subid")

all_data = rbind(td_df,asd_df)

data2write = merge(pp_data, all_data, by = "subid")
data2write$age = data2write$age.x
data2write$sex = data2write$sex.y
print(table(data2write$Schedule, data2write$subgrp))
##    
##     RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   A           4           52          24 78
##   B           0           54          31 83
##   C           3           39          21 59
##   D           0           20          18 23
print(table(data2write$Centre, data2write$subgrp))
##                
##                 RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   CAMBRIDGE               2           33           7 29
##   KINGS_COLLEGE           4           59          43 78
##   MANNHEIM                0            0           0 34
##   NIJMEGEN                1           59          33 64
##   UTRECHT                 0           14          11 38
print(table(data2write$sex, data2write$subgrp))
##         
##          RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Female           3           42          26  88
##   Male             4          123          68 155
#DSM-5 - find best split that balances participants across sites
a = findBestSplit(asd_df, seed_range = c(172342)) #,300001:500000))
## [1] 172342
best_seeds = a$seed[!is.na(a$discrepancy) & a$discrepancy==min(a$discrepancy, na.rm = TRUE)]
print(best_seeds)
## [1] 172342
# Split datasets -------------------------------------------------------------
rngSeed = best_seeds[1]

# split Schedule A dataset
dset2use = subset(asd_df, asd_df$Schedule=="A")
tmp_d = SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
A_Discovery = tmp_d[[2]]
A_Replication = tmp_d[[1]]

# split Schedule B dataset
dset2use = subset(asd_df, asd_df$Schedule=="B")
tmp_d = SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
B_Discovery = tmp_d[[2]]
B_Replication = tmp_d[[1]]

# split Schedule C dataset
dset2use = subset(asd_df, asd_df$Schedule=="C")
tmp_d = SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
C_Discovery = tmp_d[[2]]
C_Replication = tmp_d[[1]]

# split Schedule D dataset
dset2use = subset(asd_df, asd_df$Schedule=="D")
tmp_d = SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
D_Discovery = tmp_d[[2]]
D_Replication = tmp_d[[1]]

df_Disc = rbind(A_Discovery, B_Discovery, C_Discovery, D_Discovery)
df_Rep = rbind(A_Replication, B_Replication, C_Replication, D_Replication)

a = table(df_Disc$Schedule, df_Disc$Centre); print(a)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         6            17       10       7
##   B         9            16       14       3
##   C         6             9       14       2
##   D         0            10       10       0
b = table(df_Rep$Schedule, df_Rep$Centre); print(b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         7            18       10       5
##   B         8            17       13       5
##   C         6            10       13       3
##   D         0             9        9       0
print(a-b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A        -1            -1        0       2
##   B         1            -1        1      -2
##   C         0            -1        1      -1
##   D         0             1        1       0
print(sum(rowSums(abs(a-b))))
## [1] 14
data2write$dataset = NA
mask  = is.element(data2write$subid,df_Disc$subid)
data2write[mask,"dataset"] = "Discovery"
mask  = is.element(data2write$subid,df_Rep$subid)
data2write[mask,"dataset"] = "Replication"

asd_Disc = subset(data2write, data2write$dataset=="Discovery" & data2write$Diagnosis=="ASD")
asd_Rep = subset(data2write, data2write$dataset=="Replication" & data2write$Diagnosis=="ASD")

# find which seed gives best TD age-match -------------------------------------
seeds = 1:1000
pvals = data.frame(matrix(nrow = length(seeds), ncol = 2))
for (i in 1:length(seeds)) {
  res = findTDAgeMatch(data2write, seed_range = c(seeds[i],seeds[i]))
  td_Disc_matched = res[[2]]
  td_Rep_matched = res[[1]]
  tres = t.test(td_Disc_matched$age, asd_Disc$age)
  pvals[i,1] = tres$p.value
  tres = t.test(td_Rep_matched$age, asd_Rep$age)
  pvals[i,2] = tres$p.value
  #print(i)
}
a = sort.int(pvals[,1], decreasing = TRUE, index.return = TRUE)
b=pvals[a$ix,]

seed2use = 929
res = findTDAgeMatch(data2write, seed_range = c(seed2use,seed2use))
td_Disc_matched = res[[2]]
td_Rep_matched = res[[1]]

mask = is.element(data2write$subid, td_Disc_matched$subid)
data2write$dataset[mask] = "Discovery"

mask = is.element(data2write$subid, td_Rep_matched$subid)
data2write$dataset[mask] = "Replication"

print(table(data2write$dataset, data2write$subgrp))
##              
##               RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Discovery             6           80          47 121
##   Replication           1           85          47 122
fname2save = here(sprintf("asd_subgrp_data_rsfmri_ALL_DSM5_diffzscoreGrps_z%s.csv",as.character(z_thresh)))
write.csv(data2write,file = fname2save)

Plot mean FD and hypothesis test

df2use = subset(data2write, data2write$subgrp!="RRB_over_SC")

p = ggplot(data = df2use, aes(x = subgrp, y =meanFD, colour = subgrp)) + facet_wrap(. ~ dataset)
p = p + geom_jitter() + geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA) + guides(colour = FALSE)
p = p + xlab("Group") + ylab("Mean FD")
p

# Hypothesis test
df2use = data2write
y_var = "meanFD"
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df2use, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF   F-value p-value
## (Intercept)     1   501 119.30361  <.0001
## subgrp          3   501   1.25202  0.2903
# Discovery
df4mod = subset(df2use,df2use$dataset=="Discovery" & df2use$subgrp!="RRBoverSC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF   F-value p-value
## (Intercept)     1   246 144.45663  <.0001
## subgrp          3   246   2.03336  0.1098
# Replication
df4mod = subset(df2use,df2use$dataset=="Replication" & df2use$subgrp!="RRBoverSC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF  F-value p-value
## (Intercept)     1   247 68.34936  <.0001
## subgrp          3   247  0.09783  0.9612

Plot ADOS scores in the subtypes and also run a hypothesis test

df2use = subset(data2write, data2write$subgrp!="TD")
df2use$ados_2_SA_CSS[df2use$ados_2_SA_CSS==999] = NA
df2use$ados_2_RRB_CSS[df2use$ados_2_RRB_CSS==999] = NA

df4plot = melt(df2use,
               id.vars = c("subid","subgrp"), 
               measure.vars = c("ados_2_SA_CSS","ados_2_RRB_CSS"))
df4plot$variable = as.character(df4plot$variable)
df4plot$variable[df4plot$variable=="ados_2_SA_CSS"] = "SA"
df4plot$variable[df4plot$variable=="ados_2_RRB_CSS"] = "RRB"
df4plot$variable = factor(df4plot$variable)

p = ggplot(data = df4plot, aes(x = variable, 
                               y = value, 
                               colour = subgrp, 
                               group = subid)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + guides(color=FALSE)
p = p + ylab("ADOS CSS") + xlab("ADOS Subscale")
ggsave(filename = file.path(plotpath, sprintf("final_EUAIMS_ADOS_jitterplot_z%s.pdf",as.character(z_thresh))))
p

# hypothsis test on ADOS SA CSS
y_var = "ados_2_SA_CSS"
df2use = subset(data2write, !is.element(data2write$subgrp,c("TD")))

# Hypothesis test
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df2use, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF  F-value p-value
## (Intercept)     1   260 8.555833  0.0037
## subgrp          2   260 1.392833  0.2502
# hypothsis test on ADOS RRB CSS
y_var = "ados_2_RRB_CSS"
df2use = subset(data2write, !is.element(data2write$subgrp,c("TD")))

# Hypothesis test
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df2use, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF  F-value p-value
## (Intercept)     1   260 8.664999  0.0035
## subgrp          2   260 1.343152  0.2628

Descriptive stats

df2use = data2write[,c("subid","Diagnosis","dataset","Centre","meanFD","age","sex","subgrp","viq_all","piq_all","fsiq4_all","ados_2_SA_CSS","ados_2_RRB_CSS","SRS_tscore","SRS_tscore_self")]
mask = df2use==999
df2use[mask] = NA
df2use$age = df2use$age/365

cols2use = c("dataset","subgrp","age","meanFD","viq_all","piq_all","fsiq4_all","ados_2_SA_CSS","ados_2_RRB_CSS","SRS_tscore","SRS_tscore_self")
res=describeBy(x = df2use[,cols2use], group = c("subgrp"))
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##                 vars n   mean    sd median trimmed   mad   min    max range
## dataset*           1 7    NaN    NA     NA     NaN    NA   Inf   -Inf  -Inf
## subgrp*            2 7    NaN    NA     NA     NaN    NA   Inf   -Inf  -Inf
## age                3 7  17.13  6.97  20.96   17.13  4.34  7.89  23.88 16.00
## meanFD             4 7   0.31  0.38   0.18    0.31  0.14  0.06   1.14  1.09
## viq_all            5 7 108.41 21.28 110.00  108.41 20.76 78.00 143.00 65.00
## piq_all            6 7 109.28 19.70 107.00  109.28 11.86 87.00 148.00 61.00
## fsiq4_all          7 7 109.14 21.72 110.00  109.14 20.76 80.00 148.00 68.00
## ados_2_SA_CSS      8 7   4.00  3.11   3.00    4.00  2.97  1.00   9.00  8.00
## ados_2_RRB_CSS     9 7   4.29  4.11   1.00    4.29  0.00  1.00   9.00  8.00
## SRS_tscore        10 4  68.50 14.73  63.00   68.50  5.93 58.00  90.00 32.00
## SRS_tscore_self   11 4  59.00  7.12  61.00   59.00  4.45 49.00  65.00 16.00
##                  skew kurtosis   se
## dataset*           NA       NA   NA
## subgrp*            NA       NA   NA
## age             -0.25    -2.08 2.64
## meanFD           1.39     0.27 0.15
## viq_all          0.19    -1.33 8.04
## piq_all          0.82    -0.59 7.44
## fsiq4_all        0.42    -1.03 8.21
## ados_2_SA_CSS    0.43    -1.65 1.18
## ados_2_RRB_CSS   0.24    -2.18 1.55
## SRS_tscore       0.64    -1.77 7.37
## SRS_tscore_self -0.50    -1.88 3.56
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##                 vars   n   mean    sd median trimmed   mad   min    max range
## dataset*           1 165    NaN    NA     NA     NaN    NA   Inf   -Inf  -Inf
## subgrp*            2 165    NaN    NA     NA     NaN    NA   Inf   -Inf  -Inf
## age                3 165  16.65  5.80  16.07   16.40  6.39  7.08  30.28 23.20
## meanFD             4 165   0.26  0.36   0.18    0.20  0.11  0.03   3.95  3.92
## viq_all            5 163 100.66 17.91 102.65  101.24 17.27 61.00 140.91 79.91
## piq_all            6 163 102.20 20.09 105.00  103.21 19.27 52.00 150.00 98.00
## fsiq4_all          7 164 101.61 17.72 105.00  102.52 18.16 60.00 143.00 83.00
## ados_2_SA_CSS      8 162   5.90  2.54   6.00    5.96  2.97  1.00  10.00  9.00
## ados_2_RRB_CSS     9 162   4.83  2.68   5.00    4.80  2.97  1.00  10.00  9.00
## SRS_tscore        10 147  68.92 12.44  69.00   69.01 14.83 43.00  95.00 52.00
## SRS_tscore_self   11  80  62.17 10.34  61.50   61.61  9.64 43.00  94.00 51.00
##                  skew kurtosis   se
## dataset*           NA       NA   NA
## subgrp*            NA       NA   NA
## age              0.34    -0.69 0.45
## meanFD           7.03    63.36 0.03
## viq_all         -0.27    -0.68 1.40
## piq_all         -0.39    -0.33 1.57
## fsiq4_all       -0.40    -0.57 1.38
## ados_2_SA_CSS   -0.22    -0.96 0.20
## ados_2_RRB_CSS  -0.29    -1.03 0.21
## SRS_tscore      -0.03    -0.90 1.03
## SRS_tscore_self  0.60     0.55 1.16
## ------------------------------------------------------------ 
## group: SC_over_RRB
##                 vars  n  mean    sd median trimmed   mad   min    max range
## dataset*           1 94   NaN    NA     NA     NaN    NA   Inf   -Inf  -Inf
## subgrp*            2 94   NaN    NA     NA     NaN    NA   Inf   -Inf  -Inf
## age                3 94 15.84  4.76  15.51   15.63  5.09  7.78  29.23 21.45
## meanFD             4 94  0.24  0.23   0.16    0.19  0.12  0.04   1.31  1.26
## viq_all            5 90 93.85 18.33  96.00   94.24 16.89 50.91 142.00 91.09
## piq_all            6 92 96.77 21.17  99.50   97.93 20.02 44.03 138.00 93.97
## fsiq4_all          7 92 95.90 19.11  99.00   96.35 19.68 59.00 139.00 80.00
## ados_2_SA_CSS      8 89  6.36  2.77   7.00    6.49  2.97  1.00  10.00  9.00
## ados_2_RRB_CSS     9 89  4.70  2.79   5.00    4.60  2.97  1.00  10.00  9.00
## SRS_tscore        10 81 76.06  9.53  77.00   76.49 11.86 54.00  90.00 36.00
## SRS_tscore_self   11 39 62.77 10.36  62.00   62.55 10.38 40.00  89.00 49.00
##                  skew kurtosis   se
## dataset*           NA       NA   NA
## subgrp*            NA       NA   NA
## age              0.44    -0.37 0.49
## meanFD           2.47     6.61 0.02
## viq_all         -0.15    -0.15 1.93
## piq_all         -0.46    -0.52 2.21
## fsiq4_all       -0.22    -0.80 1.99
## ados_2_SA_CSS   -0.37    -1.08 0.29
## ados_2_RRB_CSS  -0.16    -1.16 0.30
## SRS_tscore      -0.28    -0.80 1.06
## SRS_tscore_self  0.29     0.18 1.66
## ------------------------------------------------------------ 
## group: TD
##                 vars   n   mean    sd median trimmed   mad   min    max  range
## dataset*           1 243    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## subgrp*            2 243    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age                3 243  16.84  5.66  16.58   16.63  6.57  6.89  29.84  22.95
## meanFD             4 243   0.20  0.34   0.13    0.15  0.07  0.03   4.60   4.58
## viq_all            5 241 104.27 18.62 106.00  105.33 14.83 45.00 160.00 115.00
## piq_all            6 241 105.36 18.92 107.00  107.07 16.31 49.00 147.00  98.00
## fsiq4_all          7 241 105.33 17.70 108.18  107.16 13.61 50.00 142.00  92.00
## ados_2_SA_CSS      8   0    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## ados_2_RRB_CSS     9   0    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## SRS_tscore        10 133  47.54  9.34  44.00   45.82  4.45 37.00  90.00  53.00
## SRS_tscore_self   11 132  47.50  5.90  46.00   46.72  5.19 39.00  69.00  30.00
##                  skew kurtosis   se
## dataset*           NA       NA   NA
## subgrp*            NA       NA   NA
## age              0.28    -0.74 0.36
## meanFD           9.47   111.25 0.02
## viq_all         -0.55     0.92 1.20
## piq_all         -0.80     0.53 1.22
## fsiq4_all       -0.96     1.13 1.14
## ados_2_SA_CSS      NA       NA   NA
## ados_2_RRB_CSS     NA       NA   NA
## SRS_tscore       1.85     3.60 0.81
## SRS_tscore_self  1.18     1.39 0.51

Model age differences

y_var = "age"

# df4mod = subset(df2use,df2use$subgrp!="RRBoverSC")
df4mod = df2use
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF  F-value p-value
## (Intercept)     1   501 775.7083  <.0001
## subgrp          3   501   1.1851  0.3148
# Discovery
df4mod = subset(df2use,df2use$dataset=="Discovery" & df2use$subgrp!="RRBoverSC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF  F-value p-value
## (Intercept)     1   246 575.3601  <.0001
## subgrp          3   246   0.7817  0.5052
# Replication
df4mod = subset(df2use,df2use$dataset=="Replication" & df2use$subgrp!="RRBoverSC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF  F-value p-value
## (Intercept)     1   247 500.7262  <.0001
## subgrp          3   247   0.7792  0.5066

Model VIQ differences

y_var = "viq_all"

# df4mod = subset(df2use,df2use$subgrp!="RRBoverSC")
df4mod = df2use
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF   F-value p-value
## (Intercept)     1   493 1990.6847  <.0001
## subgrp          3   493    6.2928   3e-04
# Discovery
df4mod = subset(df2use,df2use$dataset=="Discovery" & df2use$subgrp!="RRBoverSC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF   F-value p-value
## (Intercept)     1   242 2246.6324  <.0001
## subgrp          3   242    2.5051  0.0598
# Replication
df4mod = subset(df2use,df2use$dataset=="Replication" & df2use$subgrp!="RRBoverSC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF   F-value p-value
## (Intercept)     1   243 1553.0125  <.0001
## subgrp          3   243    6.1755   5e-04

Model PIQ differences

y_var = "piq_all"

# df4mod = subset(df2use,df2use$subgrp!="RRBoverSC")
df4mod = df2use
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF   F-value p-value
## (Intercept)     1   495 1501.1038  <.0001
## subgrp          3   495    2.7043  0.0449
# Discovery
df4mod = subset(df2use,df2use$dataset=="Discovery" & df2use$subgrp!="RRBoverSC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF   F-value p-value
## (Intercept)     1   242 1866.9357  <.0001
## subgrp          3   242    1.6548  0.1774
# Replication
df4mod = subset(df2use,df2use$dataset=="Replication" & df2use$subgrp!="RRBoverSC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF   F-value p-value
## (Intercept)     1   245 1289.4304  <.0001
## subgrp          3   245    3.3391    0.02

Model FIQ differences

y_var = "fsiq4_all"

# df4mod = subset(df2use,df2use$subgrp!="RRBoverSC")
df4mod = df2use
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF   F-value p-value
## (Intercept)     1   496 2106.2315  <.0001
## subgrp          3   496    4.5545  0.0037
# Discovery
df4mod = subset(df2use,df2use$dataset=="Discovery" & df2use$subgrp!="RRBoverSC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF   F-value p-value
## (Intercept)     1   244 2802.4473  <.0001
## subgrp          3   244    2.4532  0.0639
# Replication
df4mod = subset(df2use,df2use$dataset=="Replication" & df2use$subgrp!="RRBoverSC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA
res = anova(mod2use)
res
##             numDF denDF   F-value p-value
## (Intercept)     1   244 1611.7733  <.0001
## subgrp          3   244    4.7892  0.0029